#!/usr/bin/env python
"""xgboost shap and lime"""
# parameters for the explanation run
MODEL = "onTravelV6C"  # model identifier; every file name below is derived from it
N_SAMPLES = 500        # number of rows sampled from the training data
TRAIN_DATA_FILE = "train_" + MODEL + ".txt"
FEATURE_MAP_FILE = "feature_map_" + MODEL + ".json"
MODEL_FILE = MODEL + ".bin"
# NOTE: a previous "sample_train_..." assignment here was dead code — it was
# immediately overwritten by this line, which is the only value ever used.
SAMPLE_FILE = "sample_" + str(N_SAMPLES) + "_" + TRAIN_DATA_FILE
%%bash
# prepare: fetch the training data, feature map and model, then sample rows
# parameters (must match the Python parameter cell above)
MODEL="onTravelV6C"
N_SAMPLES=500
TRAIN_DATA_FILE="train_${MODEL}.txt"
FEATURE_MAP_FILE="feature_map_${MODEL}.json"
MODEL_FILE="${MODEL}.bin"
# NOTE: a previous sample_train_* assignment here was dead code — it was
# immediately overwritten by this line, which is the only value ever used.
SAMPLE_FILE="sample_${N_SAMPLES}_${TRAIN_DATA_FILE}"
# train data file: copy from the offline spark output if missing
if [[ ! -f ${TRAIN_DATA_FILE} ]]; then
    echo "Train Data File Not Exist"
    echo "Copy File Begin"
    cp /mfw_data/algo/wanglei/spark_offline/train_data/onTravel/${TRAIN_DATA_FILE} ./
    echo "Copy File End"
fi
# feature map data file: pull from HDFS if missing
if [[ ! -f ${FEATURE_MAP_FILE} ]]; then
    echo "Feature Map File Not Exist"
    echo "Get File Begin"
    hadoop fs -text /user/wanglei3/featureMap/onTravel/${MODEL}/part-00000.snappy > ${FEATURE_MAP_FILE}
    echo "Get File End"
fi
# xgboost model file: copy from the serving directory if missing
if [[ ! -f ${MODEL_FILE} ]]; then
    echo "Model File Not Exist"
    echo "Copy File Begin"
    cp /opt/tomcat/webapps/model/${MODEL} ./
    mv ${MODEL} ${MODEL}.bin
    echo "Copy File End"
fi
# random sampling: draw N_SAMPLES rows if the sample file is missing
if [[ ! -f ${SAMPLE_FILE} ]]; then
    echo "Sample File Not Exist"
    echo "Sampling Begin"
    # write to ${SAMPLE_FILE} directly instead of re-spelling its components
    shuf -n ${N_SAMPLES} ${TRAIN_DATA_FILE} -o ${SAMPLE_FILE}
    echo "Sampling End"
fi
ls
# ipython core option
from IPython.core.interactiveshell import InteractiveShell
# echo every expression result in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"
# package
from sklearn.datasets import load_svmlight_file
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use("seaborn")
import shap
import lime
import json
import re
# feature map: the JSON file holds one tab-delimited entry per feature.
# The first entry is a header, so it is skipped.
with open(FEATURE_MAP_FILE) as fp:
    feature_map = json.load(fp)
cols = []
# slice past the header entry instead of tracking a manual i counter
for fm in feature_map[1:]:
    print(fm)
    # the feature name sits between the first and second tab characters
    cols.append(re.search(r"\t(.*)\t", fm).group(1))
# load the sampled training rows (libsvm/svmlight format); X is sparse
X, y = load_svmlight_file(SAMPLE_FILE, n_features=len(cols))
print(X[0].todense().shape)
print(y[0])
# build a dense DataFrame: a leading all-zero "repair" column, the features, then the label
df = pd.DataFrame(X.todense(), columns=cols)
df["repair"] = np.zeros(N_SAMPLES)
df["label"] = y
df = df[["repair"] + cols + ["label"]]
df.head()
IS_TRAIN = False  # False: load the serialized serving model; True: retrain on the sample
# train xgboost model (native booster API) or load the serving binary.
# NOTE: earlier commented-out sklearn GradientBoostingClassifier / XGBClassifier
# experiments were removed as dead code. The redundant second
# `if IS_TRAIN == True:` block that picked `model` was folded in here, so the
# choice is made in a single if/else; `bst_xgb` / `bst` names are kept.
if IS_TRAIN:
    param = {
        "objective": "binary:logistic",
        "eta": 0.1,
        "max_depth": 7,
        "min_child_weight": 1,
        "gamma": 0,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "scale_pos_weight": 1,
        "silent": True,
    }
    num_boost_round = 300
    dtrain = xgb.DMatrix(df[["repair"] + cols], label=df["label"])
    bst_xgb = xgb.train(param, dtrain, num_boost_round=num_boost_round)
    model = bst_xgb
else:
    bst = xgb.Booster(model_file=MODEL_FILE)
    model = bst
# output space for the explanations: raw margin or probability
MODEL_OUTPUT = "probability"
# shap: build a TreeExplainer in the requested output space
if MODEL_OUTPUT == "margin":
    # margin explanation: no background data needed
    shap_explainer = shap.TreeExplainer(model)
elif MODEL_OUTPUT == "probability":
    # probability explanation needs a background dataset; cap its size
    BACKGROUND_DATASET_SIZE = 1000
    features = df[["repair"] + cols]
    if len(features) <= BACKGROUND_DATASET_SIZE:
        background_dataset = features
    else:
        background_dataset = features.sample(BACKGROUND_DATASET_SIZE)
    shap_explainer = shap.TreeExplainer(model, background_dataset.values, model_output="probability", feature_dependence="independent")
shap_values = shap_explainer.shap_values(df[["repair"] + cols])
print("shap_values: ", shap_values.shape)
# expected model output over the background — the additive baseline
y_base = shap_explainer.expected_value
print("y_base: ", y_base)
# score every row in the same output space the explainer uses:
# margin -> raw booster scores, probability -> sigmoid outputs
if MODEL_OUTPUT in ("margin", "probability"):
    df["pred"] = model.predict(
        xgb.DMatrix(df[["repair"] + cols], label=df["label"]),
        output_margin=(MODEL_OUTPUT == "margin"),
    )
print("pred mean: ", df["pred"].mean())
df.head()
# global views: stacked force plot, mean-|shap| bar chart, beeswarm
shap.force_plot(shap_explainer.expected_value, shap_values, df[["repair"] + cols])
shap.summary_plot(shap_values, df[["repair"] + cols], plot_type="bar")
shap.summary_plot(shap_values, df[["repair"] + cols])
if MODEL_OUTPUT == "margin":
    # interaction values are only computed for margin-space explanations here
    shap_interaction_values = shap_explainer.shap_interaction_values(df[["repair"] + cols])
    shap.summary_plot(shap_interaction_values, df[["repair"] + cols], max_display=4)
# shap detail for one random low-scoring (pred <= 0.5) row
i = np.random.choice(df[df["pred"] <= 0.5].index.tolist())
print("negative sample")
feature_names = ["repair"] + cols
player_explainer = pd.DataFrame({
    "feature": feature_names,
    "feature_value": df[feature_names].iloc[i].values,
    "shap_value": shap_values[i],
})
player_explainer
# additivity check: baseline plus shap contributions should match the prediction
print("y_base + sum_of_shap_values: %.2f" % (y_base + player_explainer["shap_value"].sum()))
print("y_pred: %.2f" % (df["pred"].iloc[i]))
shap.initjs()
shap.force_plot(shap_explainer.expected_value, shap_values[i], df[feature_names].iloc[i])
# shap detail for one random high-scoring (pred >= 0.5) row
j = np.random.choice(df[df["pred"] >= 0.5].index.tolist())
print("positive sample")
feature_names = ["repair"] + cols
player_explainer = pd.DataFrame({
    "feature": feature_names,
    "feature_value": df[feature_names].iloc[j].values,
    "shap_value": shap_values[j],
})
player_explainer
# additivity check: baseline plus shap contributions should match the prediction
print("y_base + sum_of_shap_values: %.2f" % (y_base + player_explainer["shap_value"].sum()))
print("y_pred: %.2f" % (df["pred"].iloc[j]))
shap.initjs()
shap.force_plot(shap_explainer.expected_value, shap_values[j], df[feature_names].iloc[j])
# dependence of one feature's shap value, alone and colored by an interacting feature
FEATURE = "doubleFlow_article_ctr_30_v1"
INTERACTION = "doubleFlow_user_view_30"
shap.dependence_plot(FEATURE, shap_values, df[feature_names], interaction_index=None, show=False)
shap.dependence_plot(FEATURE, shap_values, df[feature_names], interaction_index=INTERACTION, show=False)
# lime
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    df[["repair"] + cols].values,
    feature_names=["repair"] + cols,
    class_names=["0", "1"],
    verbose=True,
)
# clear the booster's stored feature names so it accepts the bare numpy
# arrays that lime generates when perturbing an instance
model.feature_names = None


def predict_fn(x):
    """Return an (n, 2) array of class probabilities [P(0), P(1)] for lime.

    The booster outputs P(1) per row; the complement column is derived from it.
    """
    preds = model.predict(xgb.DMatrix(x))
    # vectorized column stack instead of a per-row Python list comprehension
    return np.column_stack((1 - preds, preds))
# lime explanation for one random low-scoring (pred <= 0.5) row
i = np.random.choice(df[df["pred"] <= 0.5].index.tolist())
print("negative sample")
feature_names = ["repair"] + cols
player_explainer = pd.DataFrame({
    "feature": feature_names,
    "feature_value": df[feature_names].iloc[i].values,
    "shap_value": shap_values[i],  # shap values shown alongside for comparison
})
player_explainer
exp = lime_explainer.explain_instance(df[feature_names].values[i], predict_fn, num_features=5)
exp.show_in_notebook(show_table=True)
exp.as_list()
fig = exp.as_pyplot_figure()
fig.show()
# lime explanation for one random high-scoring (pred >= 0.5) row
j = np.random.choice(df[df["pred"] >= 0.5].index.tolist())
print("positive sample")
feature_names = ["repair"] + cols
player_explainer = pd.DataFrame({
    "feature": feature_names,
    "feature_value": df[feature_names].iloc[j].values,
    "shap_value": shap_values[j],  # shap values shown alongside for comparison
})
player_explainer
exp = lime_explainer.explain_instance(df[feature_names].values[j], predict_fn, num_features=5)
exp.show_in_notebook(show_table=True)
exp.as_list()
fig = exp.as_pyplot_figure()
fig.show()